{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 数据探索"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "训练集数据长度：14000，测试集数据长度：6000\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>X1</th>\n",
       "      <th>X2</th>\n",
       "      <th>X3</th>\n",
       "      <th>X4</th>\n",
       "      <th>X5</th>\n",
       "      <th>cls</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>318</th>\n",
       "      <td>0.255582</td>\n",
       "      <td>-0.578199</td>\n",
       "      <td>-0.504863</td>\n",
       "      <td>-0.963650</td>\n",
       "      <td>-1.015626</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1513</th>\n",
       "      <td>-1.523422</td>\n",
       "      <td>1.583999</td>\n",
       "      <td>0.094002</td>\n",
       "      <td>-2.276259</td>\n",
       "      <td>-1.293283</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10025</th>\n",
       "      <td>1.099701</td>\n",
       "      <td>-1.785462</td>\n",
       "      <td>1.767452</td>\n",
       "      <td>-1.121677</td>\n",
       "      <td>-1.599187</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             X1        X2        X3        X4        X5  cls\n",
       "318    0.255582 -0.578199 -0.504863 -0.963650 -1.015626    0\n",
       "1513  -1.523422  1.583999  0.094002 -2.276259 -1.293283    0\n",
       "10025  1.099701 -1.785462  1.767452 -1.121677 -1.599187    0"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 为了方便叙述建模流程，这里准备了两个脱敏数据集：一个训练集一个测试集\n",
    "train = pd.read_csv('imb_train.csv')\n",
    "test = pd.read_csv('imb_test.csv')\n",
    "\n",
    "print(f'训练集数据长度：{len(train)}，测试集数据长度：{len(test)}')\n",
    "train.sample(3)\n",
    "\n",
    "# 参数解释\n",
    "## X1 ~ X5：自变量，cls：因变量 care life of science - 科学关爱生命 0-不得病，1-得病"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "训练集中，因变量 cls 分类情况：\n",
      "                  0    1\n",
      "value_counts  13644  356\n",
      "=======================================================\n",
      "\n",
      "测试集中，因变量 cls 分类情况：\n",
      "                 0    1\n",
      "value_counts  5848  152\n"
     ]
    }
   ],
   "source": [
    "# 查看测试集与训练集的因变量分类情况\n",
    "print('训练集中，因变量 cls 分类情况：')\n",
    "print(train['cls'].agg(['value_counts']).T)\n",
    "print('='*55 + '\\n')\n",
    "\n",
    "print('测试集中，因变量 cls 分类情况：')\n",
    "print(test['cls'].agg(['value_counts']).T)\n",
    "# 可知训练集和测试集中的占比少的类别 1 实在是太少了，比较严重的不平衡"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "训练集中因变量 cls 分类情况：Counter({0: 13644, 1: 356})\n",
      "测试集因变量 cls 分类情况：Counter({0: 5848, 1: 152})\n"
     ]
    }
   ],
   "source": [
    "# 另一种探索方法，直接使用 collection 库里面的 Counter 函数\n",
    "from collections import Counter\n",
    "print('训练集中因变量 cls 分类情况：{}'.format(Counter(train['cls'])))\n",
    "print('测试集因变量 cls 分类情况：{}'.format(Counter(test['cls'])))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 不同的抽样方法对训练集进行处理\n",
    "测试集是不做任何处理的！！保留严峻的比例考验来测试模型。\n",
    "\n",
    "训练模型时用到的数据才是经过处理的，0-1 比例再 1：1 ~ 1：10 之间"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 拆分因变量与自变量"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_train = train['cls'];        y_test = test['cls']\n",
    "X_train = train.loc[:, :'X5'];  X_test = test.loc[:, :'X5']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(            X1        X2        X3       X4        X5\n",
       " 9382 -1.191287  1.363136 -0.705131 -1.24394 -0.520264, 0    0\n",
       " Name: cls, dtype: int64)"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_train.sample(), y_train[:1] "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 抽样的几种方法\n",
    "- Random Over Sampling：随机过抽样\n",
    "- SMOTE 方法过抽样\n",
    "- SMOTETomek 综合抽样"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "不经过任何采样处理的原始 y_train 中的分类情况：Counter({0: 13644, 1: 356})\n",
      "随机过采样后，训练集 y_ros 中的分类情况：Counter({0: 13644, 1: 13644})\n",
      "SMOTE过采样后，训练集 y_sos 中的分类情况：Counter({0: 13644, 1: 13644})\n",
      "综合采样后，训练集 y_kos 中的分类情况：Counter({0: 13395, 1: 13395})\n"
     ]
    }
   ],
   "source": [
    "# imblearn：imbalance learning 这个包 pip install imblearn 安装一下即可\n",
    "from imblearn.over_sampling import RandomOverSampler\n",
    "\n",
    "print('不经过任何采样处理的原始 y_train 中的分类情况：{}'.format(Counter(y_train)))\n",
    "\n",
    "# 采样策略 sampling_strategy = 'auto' 的 auto 默认抽成 1：1，\n",
    " ## 如果想要另外的比例如杰克所说的 1：5，甚至底线 1:10，需要根据文档自行调整参数\n",
    " ## 文档：https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.over_sampling.RandomOverSampler.html\n",
    "# 先定义好好，未开始正式训练拟合\n",
    "ros = RandomOverSampler(random_state=0, sampling_strategy='auto') \n",
    "X_ros, y_ros = ros.fit_sample(X_train, y_train)\n",
    "print('随机过采样后，训练集 y_ros 中的分类情况：{}'.format(Counter(y_ros)))\n",
    "\n",
    "# 同理，SMOTE 的步骤也是如此\n",
    "from imblearn.over_sampling import SMOTE\n",
    "sos = SMOTE(random_state=0)\n",
    "X_sos, y_sos = sos.fit_sample(X_train, y_train)\n",
    "print('SMOTE过采样后，训练集 y_sos 中的分类情况：{}'.format(Counter(y_sos)))\n",
    "\n",
    "# 同理，综合采样（先过采样再欠采样）\n",
    "## # combine 表示组合抽样，所以 SMOTE 与 Tomek 这两个英文单词写在了一起\n",
    "from imblearn.combine import SMOTETomek\n",
    "kos = SMOTETomek(random_state=0)  # 综合采样\n",
    "X_kos, y_kos = kos.fit_sample(X_train, y_train)\n",
    "print('综合采样后，训练集 y_kos 中的分类情况：{}'.format(Counter(y_kos)))\n",
    "# 不难两种过采样方法都将原来 y_train 中的占比少的分类 1 提到了与 0 数量一致的情况\n",
    "# 但因为综合采样在过采样后会使用欠采样，所以数量会稍微少一点点"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 决策树建模"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn import metrics\n",
    "from sklearn.model_selection import GridSearchCV"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 创建了决策树类，还并没有正式开始训练模型\n",
    "clf = DecisionTreeClassifier(criterion='gini', random_state=1234)\n",
    "# 梯度优化\n",
    "param_grid = {'max_depth':[3, 4, 5, 6], 'max_leaf_nodes':[4, 6, 8, 10, 12]}\n",
    "# cv 表示是创建一个类，还并没有开始训练模型\n",
    "cv = GridSearchCV(clf, param_grid=param_grid, scoring='f1')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 注意！！！！！！！！\n",
    "## 这里的数据使用大有玄机\n",
    "## 第一组数据 X，y_train 是没有经过任何操作的\n",
    "## 第二组 ros 为随机过采样，第三组 sos 为 SMOTE 过采样，\n",
    "## 最后一组 kos 则为综合采样\n",
    "data = [[X_train, y_train],\n",
    "        [X_ros, y_ros],\n",
    "        [X_sos, y_sos],\n",
    "        [X_kos, y_kos]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "auc:0.747 recall:0.493 precision:0.987\n",
      "auc:0.824 recall:0.783 precision:0.132\n",
      "auc:0.819 recall:0.757 precision:0.143\n",
      "auc:0.819 recall:0.757 precision:0.142\n"
     ]
    }
   ],
   "source": [
    "for features, labels in data:\n",
    "    cv.fit(features, labels) # 对四组数据分别做模型\n",
    "    # 注意：X_test 是从来没被动过的，回应了理论知识：\n",
    "     ## 使用比例优良的(1:1~1:10)训练集来训练模型，用残酷的(分类为1的仅有2%)测试集来考验模型\n",
    "    predict_test = cv.predict(X_test) \n",
    "# 其实 recall 和 precision 的用处都不大，看 auc 即可\n",
    "## recall：覆盖率，预测出分类为 0 且正确的，但本来数据集中分类为 0 的占比本来就很大\n",
    "## 而且 recall 是以阈值为 0.5 来计算的，那我们就可以简单的认为预测的欺诈概率大于 0.5 就算欺诈了吗？还是说如果他的潜在欺诈概率只要超过 20% 就已经算为欺诈了呢？\n",
    "## 另外：阈值也不是 0.01，正确确定阈值的方法应该是结合先验概率(1%)，在 1% ~ 50% 之间，根据公式\n",
    "    print('auc:%.3f' %metrics.roc_auc_score(y_test, predict_test), \n",
    "          'recall:%.3f' %metrics.recall_score(y_test, predict_test),\n",
    "          'precision:%.3f' %metrics.precision_score(y_test, predict_test))\n",
    "    # 发现并不一定是综合采样就一定高分，毕竟每份数据集都有属于它自己的特征\n",
    "    ## 不过一点都不处理的模型的 auc 是最低的"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 采用权重法"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "auc:0.806 recall:0.618 precision:0.740\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'class_weight': {0: 1, 1: 5}, 'max_depth': 4, 'max_leaf_nodes': 12}"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#2、采用改变样本权重的方法\n",
    "param_grid2 = {'max_depth':[3, 4, 5, 6], \n",
    "               'max_leaf_nodes':[4, 6, 8, 10, 12], \n",
    "               'class_weight':[{0:1, 1:5}, {0:1, 1:10}, {0:1, 1:15}]}\n",
    "# y=0:y=1 = 1:5/1:1-/1:15，逐个尝试，但需要注意的是，\n",
    " ## 这个权重比例并不是简单的将占比少的 y=1 的数据✖5，\n",
    " ## 而是在计算模型精度的时候这个权重才派上用场，即如果你预测错一个 y=1 的，\n",
    " ## 就算预测错 5/10/15 个\n",
    "# clf 是已经定义好的决策树\n",
    "cv2 = GridSearchCV(clf, param_grid=param_grid2, scoring='f1')\n",
    "\n",
    "# 当使用权重法的时候是需要在原始的数据集上使用\n",
    "cv2.fit(X_train, y_train)\n",
    "predict_test2 = cv2.predict(X_test)\n",
    "\n",
    "print('auc:%.3f' %metrics.roc_auc_score(y_test, predict_test2),\n",
    "      'recall:%.3f' %metrics.recall_score(y_test, predict_test2),\n",
    "      'precision:%.3f' %metrics.precision_score(y_test, predict_test2))\n",
    "# 肯定比不做任何操作要好好，但不够使用算法来进行采样处理好。\n",
    "\n",
    "cv2.best_params_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  },
  "latex_envs": {
   "LaTeX_envs_menu_present": true,
   "autoclose": false,
   "autocomplete": true,
   "bibliofile": "biblio.bib",
   "cite_by": "apalike",
   "current_citInitial": 1,
   "eqLabelWithNumbers": true,
   "eqNumInitial": 1,
   "hotkeys": {
    "equation": "Ctrl-E",
    "itemize": "Ctrl-I"
   },
   "labels_anchors": false,
   "latex_user_defs": false,
   "report_style_numbering": false,
   "user_envs_cfg": false
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
